suppressPackageStartupMessages({
library(package = "knitr")
library(package = "Biobase")
library(package = "RColorBrewer")
library(package = "tidyverse")
library(package = "ggbeeswarm")
library(package = "pvca") # dleelab/pvca
library(package = "lme4")
library(package = "cluster")
library(package = "ComplexHeatmap") # Bioc Install
library(package = "ImmuneSignatures2")
library(package = "cowplot")
library(here)
})
These figures only look at the “Young” cohort that is defined as 18 to 50 years old and the “Extended Old” cohort of 50 to 90 years old. The “Old” cohort (not used here) was 60 to 90 years old.
subsetToCompleteGenes <- function(eset) {
# only include complete cases
# TODO: Check in with IS2 folks about this... SHould I update the datsets?
completeGenes <- complete.cases(exprs(eset))
eset[completeGenes]
}
noRespGEFile <- file.path(dataCacheDir, paste0(datePrefix,"extendedOld_norm_noResponse_eset.rds"))
old.NoResp <- subsetToCompleteGenes(readRDS(file = noRespGEFile))
noRespGEFile <- file.path(dataCacheDir, paste0(datePrefix,"young_norm_noResponse_eset.rds"))
young.NoResp <- subsetToCompleteGenes(readRDS(file = noRespGEFile))
withRespGEFile <- file.path(dataCacheDir, paste0(datePrefix,"young_norm_withResponse_eset.rds"))
young.WithResp <- subsetToCompleteGenes(readRDS(file = withRespGEFile))
Authors: Slim Fourati, Joanna Arce
# any sampling data after 20 days coded as >20
df.fig1b <- pData(young.NoResp) %>%
select(uid, study_time_collected, pathogen, vaccine_type) %>%
arrange(study_time_collected) %>%
mutate(study_time_collected = signif(study_time_collected, digits = 3),
study_time_collected.factor = ifelse(test = study_time_collected > 20,
yes = ">20",
no = study_time_collected),
study_time_collected.factor =
factor(study_time_collected.factor,
levels = unique(study_time_collected.factor)),
vaccine = paste0(pathogen, " (", vaccine_type, ")")) %>%
group_by(study_time_collected.factor, vaccine) %>%
summarize(n = n())
ggplot(data = df.fig1b,
mapping = aes(x = study_time_collected.factor,
y = n)) +
geom_bar(stat = "identity", mapping = aes(fill = vaccine)) +
labs(x = "Days post-vaccination", y = "Number of samples") +
scale_y_continuous(limits = c(0, 800),
breaks = seq(from = 0, to = 800, by = 100)) +
scale_fill_manual(name = "Pathogen (Vaccine type)",
values = vaccine2color) +
theme_bw() +
theme(panel.grid = element_blank(),
panel.grid.major.y = element_line(color = "lightgrey"),
axis.text = element_text(color = "black"),
axis.text.x = element_text(angle = 45, hjust = 1),
legend.pos = "bottom",
legend.text = element_text(size = 7),
legend.key.height = unit(0.02, units = "npc"),
legend.key.width = unit(0.015, units = "npc")) +
guides(fill = guide_legend(ncol = 2))
Author: Slim Fourati
df.fig1c <- pData(young.NoResp) %>%
select(participant_id, pathogen, vaccine_type, age_imputed, y_chrom_present) %>%
distinct() %>%
mutate(vaccine = paste0(pathogen, " (", vaccine_type, ")"))
# remove vaccine where all the participant have the same age
flag <- df.fig1c %>%
group_by(vaccine) %>%
summarize(nbUniqueAge = length(unique(age_imputed))) %>%
filter(nbUniqueAge > 1)
df.fig1c <- filter(df.fig1c, vaccine %in% flag$vaccine)
ggplot(data = df.fig1c,
mapping = aes(x = vaccine,
y = age_imputed)) +
geom_boxplot(fill = "transparent", outlier.color = "transparent") +
geom_beeswarm(mapping = aes(color = vaccine, shape = y_chrom_present),
cex = 0.3) +
scale_color_manual(name = "Pathogen (Vaccine type)",
values = vaccine2color) +
labs(x = "Pathogen (Vaccine type)", y = "Age") +
theme_bw() +
theme(panel.grid = element_blank(),
panel.grid.major.y = element_line(color = "lightgrey"),
axis.text = element_text(color = "black"),
axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
legend.text = element_text(size = 7),
legend.key.height = unit(0.02, units = "npc"),
legend.key.width = unit(0.015, units = "npc")) +
guides(fill = guide_legend(ncol = 1))
# test for difference between vaccine
kruskal.test(formula = age_imputed ~ vaccine, data = df.fig1c)
##
## Kruskal-Wallis rank sum test
##
## data: age_imputed by vaccine
## Kruskal-Wallis chi-squared = 110.1, df = 7, p-value < 2.2e-16
Author: Slim Fourati
em.fig1d <- exprs(young.NoResp)
pd.fig1d <- pData(young.NoResp) %>%
mutate(Vaccine = paste0(pathogen, " (", vaccine_type, ")")) %>%
select(uid, y_chrom_present, ethnicity, Vaccine, study_time_collected,
age_imputed) %>%
rename(
Ychrom = y_chrom_present,
Ethnicity = ethnicity,
Timepoint = study_time_collected,
Age = age_imputed) %>%
column_to_rownames(var = "uid")
pd.fig1d <- pd.fig1d[colnames(em.fig1d), ]
df.fig1d <- data.frame(explained = as.vector(fit),
effect = names(fit)) %>%
arrange(explained) %>%
mutate(effect = factor(effect, levels = effect))
ggplot(data = df.fig1d,
mapping = aes(x = effect, y = explained)) +
geom_bar(stat = "identity") +
geom_text(aes(label = signif(explained, digits = 3)),
nudge_y = 0.01,
size = 3) +
labs(x = NULL, y = "Proportion of the variance explained") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90,
vjust = 0.5,
hjust = 1))
Author: Slim Fourati
Pre-Processing: Microarray probes may have different affinities for their target mRNA and thus the itensities can’t be compared between probes inside the same sample. A way to correct for that is to scale each probe to a mean of 0 and a standard-deviation of 1.
The following looks only at pre-vaccination timepoints
sleaFile <- file.path(dataCacheDir, "sleaSet.rds")
sleaCached <- file.exists(sleaFile)
if(sleaCached){
sleaSet <- readRDS(sleaFile)
}
batch <- interaction(young.NoResp$study_accession,
young.NoResp$matrix,
drop = TRUE)
scaledMat <- by(data = t(exprs(young.NoResp)),
INDICES = batch,
FUN = function(x) scale(x)) %>%
do.call(what = rbind) %>%
t()
scaledMat <- scaledMat[, sampleNames(young.NoResp)]
# SLEA (Ref: Lopez-Bigas N. et al. (2008) Genome Biol.)
B <- 100
flag <- lapply(all_genesets, FUN = intersect, y = rownames(scaledMat)) %>%
sapply(FUN = length)
zscoreMat <- sapply(all_genesets, FUN = function(gs) {
gs <- intersect(gs, rownames(scaledMat))
ngenes <- length(gs)
mu <- colMeans(scaledMat[gs, , drop = FALSE], na.rm = TRUE)
muPermut <- mclapply(1:B, FUN = function(seed) {
set.seed(seed = seed)
muHat <- colMeans(scaledMat[sample.int(nrow(scaledMat), ngenes), , drop = FALSE],
na.rm = TRUE)
return(value = muHat)
})
muPermut <- do.call(what = rbind, args = muPermut)
zscore <- (mu - colMeans(muPermut, na.rm = TRUE)) /
apply(muPermut, MARGIN = 2, FUN = sd, na.rm = TRUE)
return(value = zscore)
})
sleaSet <- ExpressionSet(assayData = t(zscoreMat),
phenoData = AnnotatedDataFrame(pData(young.NoResp)))
saveRDS(sleaSet, file = sleaFile)
inflamGS <- c("HALLMARK_INFLAMMATORY_RESPONSE",
"HALLMARK_COMPLEMENT",
"HALLMARK_IL6_JAK_STAT3_SIGNALING",
"HALLMARK_TNFA_SIGNALING_VIA_NFKB")
Author: Slim Fourati
zscoreTemp <- exprs(sleaSet)[, sleaSet$study_time_collected <= 0]
# remove genesets with missing symbols in virtual study
zscoreTemp <- zscoreTemp[rowMeans(is.na(zscoreTemp)) < 1, ]
# for representation purpsis only present top 200 varying genesets
varLS <- apply(zscoreTemp, MARGIN = 1, FUN = var)
zscoreTemp <- zscoreTemp[order(varLS, decreasing = TRUE)[1:200], ]
# cutree into three groups (see gap statistic analysis
set.seed(seed = 23)
tree_col <- kmeans(x = t(zscoreTemp), centers = 3)
gr <- tree_col$cluster
# relabel cluster based on inflammatory genesets
df.fig2 <- data.frame(mu = colMeans(zscoreTemp[inflamGS, ])) %>%
rownames_to_column()
df.fig2 <- data.frame(grn = gr) %>%
rownames_to_column() %>%
merge(x = df.fig2) %>%
merge(y = pData(sleaSet),
by.x = "rowname",
by.y = "uid")
df.fig2 %>%
group_by(grn) %>%
summarize(q2 = median(mu)) %>%
ungroup() %>%
mutate(gr = ifelse(q2 %in% min(q2),
yes = "low",
no = NA),
gr = ifelse(q2 %in% max(q2) & is.na(gr),
yes = "high",
no = gr),
gr = ifelse(is.na(gr),
yes = "mid",
no = gr),
gr = factor(gr, levels = c("low", "mid", "high"))) %>%
merge(x = df.fig2,
by = "grn") %>%
column_to_rownames(var = "rowname") -> df.fig2
df.fig2 <- df.fig2[colnames(zscoreTemp), ]
# Remove names of unwanted cells in plot
rowLabels <- rownames(zscoreTemp)
targetCellNames <- "B.cell|E2F|T.cell|MYC| NK| Interferon|STAT|migration|Monocytes| DC|Neutrophiles"
unwantedCells <- !grepl(pattern = targetCellNames,
rownames(zscoreTemp),
ignore.case = TRUE)
rowLabels[ unwantedCells ] <- ""
# generate heatmap annotation
ha <- df.fig2 %>%
rownames_to_column() %>%
mutate(inflam.gs = findInterval(mu,
vec = quantile(mu,
probs = c(0, 0.33, 0.66, 1)),
rightmost.closed = TRUE),
inflam.gs = paste0("tier", inflam.gs)) %>%
select(rowname, inflam.gs) %>%
column_to_rownames() %>%
.[colnames(zscoreTemp), , drop = FALSE] %>%
HeatmapAnnotation(df = .,
col = list(inflam.gs = c(tier1 = "yellow",
tier2 = "orange",
tier3 = "red")))
Heatmap(matrix = zscoreTemp,
row_names_side = "left",
show_column_names = FALSE,
column_split = df.fig2$gr,
show_row_dend = FALSE,
name = "SLEA z-score",
row_labels = rowLabels,
row_names_gp = gpar(fontsize = 8),
top_annotation = ha)
Author: Slim Fourati
clusGapFile <- file.path(dataCacheDir, "clusGap.rds")
if(!file.exists(clusGapFile)){
set.seed(seed = 23)
fit <- clusGap(t(zscoreTemp), kmeans, K.max = 10)
saveRDS(fit, file = clusGapFile)
}else{
fit <- readRDS(clusGapFile)
}
plot(fit)
print(fit, method = "firstSEmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = t(zscoreTemp), FUNcluster = kmeans, K.max = 10)
## B=100 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
## --> Number of clusters (method 'firstSEmax', SE.factor=1): 1
## logW E.logW gap SE.sim
## [1,] 9.092772 9.773302 0.6805300 0.005232080
## [2,] 8.983406 9.661745 0.6783390 0.003892934
## [3,] 8.935179 9.626523 0.6913443 0.004567634
## [4,] 8.901936 9.596598 0.6946614 0.004983855
## [5,] 8.883434 9.578408 0.6949743 0.004300859
## [6,] 8.870609 9.563215 0.6926062 0.004140528
## [7,] 8.849370 9.550025 0.7006544 0.003918490
## [8,] 8.839935 9.538047 0.6981121 0.004245120
## [9,] 8.828216 9.526725 0.6985095 0.003917492
## [10,] 8.815861 9.516875 0.7010137 0.003458210
# continuous variable
df.fig2 %>%
select(gr,
age_reported,
age_imputed) %>%
pivot_longer(cols = -gr) %>%
group_by(name) %>%
do(p = kruskal.test(formula = value~gr,
data = .)$p.value) %>%
ungroup() %>%
mutate(p = unlist(p))
# categorical variable
df.fig2 %>%
select(gr,
study_time_collected,
gender, # TODO: Use ychrom_present?
race,
ethnicity,
exposure_material_reported,
matrix,
study_accession,
Hispanic,
White,
Asian,
Black,
cell_type,
cohort,
featureSetName,
featureSetName2,
featureSetVendor,
vaccine,
vaccine_type,
adjuvant,
pathogen) %>%
sapply(FUN = as.character) %>%
as.data.frame() %>%
pivot_longer(cols = -gr) %>%
group_by(name) %>%
do(p = {
fit <- try(fisher.test(table(.$value, .$gr),
simulate.p.value = TRUE),
silent = TRUE)
if ("try-error" %in% class(fit)) {
NA
} else {
fit$p.value
}
}) %>%
ungroup() %>%
mutate(p = unlist(p), adj.p = p.adjust(p, method = "BH"))
Author: Slim Fourati
df.fig2sb <- df.fig2 %>%
filter(mu < 4.5 & mu > -4.5) %>% # remove outlier
filter(duplicated(participant_id) |
duplicated(participant_id, fromLast = TRUE))
df.fig2sb %>%
filter(study_accession %in% c("SDY80", "SDY180")) %>%
arrange(participant_id, study_time_collected) %>%
wilcox.test(formula = mu ~ study_time_collected,
data = .,
paired = TRUE)
##
## Wilcoxon signed rank test with continuity correction
##
## data: mu by study_time_collected
## V = 716, p-value = 0.00831
## alternative hypothesis: true location shift is not equal to 0
df.fig2sb %>%
filter(study_accession %in% c("SDY80", "SDY180")) %>%
select(participant_id, gr, mu, study_time_collected) %>%
pivot_wider(names_from = study_time_collected,
values_from = c(gr, mu)) %>%
mutate(stable = (`gr_-7` == `gr_0`) | abs(`mu_0`-`mu_-7`)< 1) %>%
group_by(stable) %>%
summarize(n = n())
df.fig2sb %>%
filter(study_accession %in% c("SDY80", "SDY180")) %>%
select(participant_id, gr, mu, study_time_collected) %>%
pivot_wider(names_from = study_time_collected,
values_from = c(gr, mu)) %>%
mutate(stable = (`gr_-7` == `gr_0`) | abs(`mu_0`-`mu_-7`)< 1) %>%
select(participant_id, stable) %>%
merge(x = df.fig2sb, by = "participant_id", all.x = TRUE) -> df.fig2sb
ggplot(data = df.fig2sb,
mapping = aes(x = factor(study_time_collected), y = mu)) +
geom_point(mapping = aes(color = gr)) +
geom_line(mapping = aes(group = participant_id,
linetype = stable)) +
scale_color_discrete(name = "pre-vax cluster") +
labs(x = "study_time_collected",
y = "Inflammatory genesets (SLEA z-score)") +
theme_bw()
getSleaPDSubset.byGS <- function(geneset){
pd <- pData(sleaSet)
pd$mu <- colMeans(exprs(sleaSet)[geneset, ])
pd <- pd[ !is.na(pd$gr), ]
}
getSleaPDSubset.byGenes <- function(genes){
df <- exprs(young.NoResp)[genes, ] %>%
as.data.frame() %>%
rownames_to_column(var = "gene") %>%
pivot_longer(cols = -gene, names_to = "uid") %>%
merge(y = pData(sleaSet), by = "uid") %>%
filter(!is.na(gr))
}
plotSLEASubsetByTime.byGS <- function(pd, ylabel){
ggplot(data = pd,
mapping = aes(x = study_time_collected, y = mu, color = gr)) +
geom_line(mapping = aes(group = participant_id),
alpha = 0.1) +
geom_hline(yintercept = 0, color = "grey", size = 2) +
geom_smooth(method = "loess", formula = "y~x", color = "black") +
facet_wrap(facets = ~gr) +
scale_x_continuous(limits = c(0, 28)) +
labs(y = ylabel,
x = "Time after vaccination (days)") +
theme_bw() +
theme(axis.text = element_text(size = 15),
strip.text = element_text(size = 20),
axis.title.x = element_text(size = 20),
legend.pos = "none",
panel.grid = element_blank())
}
plotSLEASubsetByTime.byGene <- function(df){
q2DF <- df %>%
group_by(gene) %>%
summarize(q2 = median(value))
ggplot(data = df,
mapping = aes(x = study_time_collected, y = value, color = gr)) +
geom_line(mapping = aes(group = participant_id),
alpha = 0.1) +
geom_hline(data = q2DF,
mapping = aes(yintercept = q2),
color = "grey") +
geom_smooth(method = "loess", formula = "y~x", color = "black") +
facet_grid(facets = gene~gr) +
scale_x_continuous(limits = c(0, 28)) +
scale_y_continuous(limits = c(7, 10.5)) +
labs(y = "log2 intensity (ComBat)") +
theme_bw() +
theme(axis.text = element_text(size = 15),
strip.text = element_text(size = 20),
axis.title.x = element_text(size = 20),
legend.pos = "none",
panel.grid = element_blank())
}
Author: Slim Fourati
pd.fig3a <- getSleaPDSubset.byGS(inflamGS)
plotSLEASubsetByTime.byGS(pd.fig3a, "Inflammatory pathways (SLEA z-score)")
Author: Slim Fourati
inflamGenes <- c("IL1B", "NLRP3", "TNFAIP2")
df.figs3a <- getSleaPDSubset.byGenes(inflamGenes)
plotSLEASubsetByTime.byGene(df.figs3a)
Author: Slim Fourati
interferonGS <- c("HALLMARK_INTERFERON_ALPHA_RESPONSE",
"HALLMARK_INTERFERON_GAMMA_RESPONSE")
pd.figs3b <- getSleaPDSubset.byGS(interferonGS)
plotSLEASubsetByTime.byGS(pd.figs3b, "Interferon-stimulated genes (SLEA z-score)")
Author: Slim Fourati
isgGenes <- c("STAT2", "IRF9", "MX1")
df.figs3b2 <- getSleaPDSubset.byGenes(isgGenes)
plotSLEASubsetByTime.byGene(df.figs3b2)
Author: Slim Fourati
bcellGS <- c("enriched in B cells (I) (M47.0)",
"enriched in B cells (V) (M47.4)",
"plasma cells & B cells, immunoglobulins (M156.0)")
pd.fig3c <- getSleaPDSubset.byGS(bcellGS)
plotSLEASubsetByTime.byGS(pd.fig3c, "B cells (SLEA z-score)")
Author: Slim Fourati
bcellGenes <- c("CD79A", "CD79B", "BANK1")
df.figs3c <- getSleaPDSubset.byGenes(bcellGenes)
plotSLEASubsetByTime.byGene(df.figs3c)
Author: Slim Fourati
pd.fig4a <- pData(sleaSet)[sleaSet$study_time_collected <= 0, ]
df.fig4a <- pd.fig4a %>%
merge(y = distinct(select(pData(young.WithResp),
participant_id, MFC, MFC_p30))) %>%
group_by(study_accession) %>%
mutate(sMFC = scale(MFC),
vaccine = paste0(pathogen,".", vaccine_type),
vaccine = ifelse(pathogen %in% "Meningococcus",
yes = "Meningococcus",
no = vaccine))
ggplot(data = df.fig4a,
mapping = aes(x = factor(gr), y = sMFC)) +
geom_beeswarm(cex = 1.1, size = 0.75) +
geom_boxplot(outlier.color = "transparent",
fill = "transparent",
col = "red") +
labs(x = "Prevax inflammatory cluster") +
theme_bw()
kruskal.test(x = df.fig4a$sMFC, g = df.fig4a$gr)
##
## Kruskal-Wallis rank sum test
##
## data: df.fig4a$sMFC and df.fig4a$gr
## Kruskal-Wallis chi-squared = 3.0735, df = 2, p-value = 0.2151
pairwise.wilcox.test(x = df.fig4a$sMFC,
g = df.fig4a$gr,
p.adjust.method = "none") %>%
print()
##
## Pairwise comparisons using Wilcoxon rank sum test
##
## data: df.fig4a$sMFC and df.fig4a$gr
##
## low mid
## mid 0.339 -
## high 0.088 0.380
##
## P value adjustment method: none
Author: Slim Fourati
df.fig4a %>%
filter(age_imputed < 50 & gr %in% c("low", "high")) %>%
group_by(vaccine) %>%
do(n = nrow(.),
estimate = {
fit <- try(wilcox.test(formula = sMFC~gr,
data = .,
conf.int = TRUE),
silent = TRUE)
if ("try-error" %in% class(fit)) {
NA
} else {
fit$estimate
}
},
p = {
fit <- try(wilcox.test(formula = sMFC~gr, data = ., conf.int = TRUE),
silent = TRUE)
if ("try-error" %in% class(fit)) {
NA
} else {
fit$p.value
}
}) %>%
mutate(n = unlist(n),
estimate = unlist(estimate),
p = unlist(p)) %>%
print()
## # A tibble: 9 x 4
## # Rowwise:
## vaccine n estimate p
## <chr> <int> <dbl> <dbl>
## 1 Hepatitis B.Inactivated 14 -0.0000656 0.940
## 2 Influenza.Inactivated 365 -0.226 0.0720
## 3 Influenza.Live attenuated 21 -0.0000207 0.373
## 4 Meningococcus 22 -0.0579 0.794
## 5 Pneumococcus.Polysaccharide 6 0.695 0.0902
## 6 Smallpox.Live attenuated 3 1.22 0.667
## 7 Tuberculosis.Recombinant Viral Vector 8 0.121 1
## 8 Varicella Zoster.Live attenuated 14 -0.171 0.606
## 9 Yellow Fever.Live attenuated 65 -0.0152 0.703
Author: Slim Fourati
plotTemp <- filter(df.fig4a, !(vaccine %in% c("Pneumococcus.Polysaccharide",
"Smallpox.Live attenuated",
"Tuberculosis.Recombinant Viral Vector")))
ggplot(data = filter(plotTemp, age_imputed < 50),
mapping = aes(x = factor(gr), y = sMFC)) +
geom_beeswarm() +
geom_boxplot(outlier.color = "transparent",
fill = "transparent",
col = "red") +
labs(x = "Prevax inflammatory cluster",
title = "age < 50") +
facet_wrap(facets = ~ifelse(test= vaccine %in% "Influenza.Inactivated",
yes = "Influenza.Inactivated",
no = "others")) +
theme_bw()
filter(plotTemp, vaccine %in% "Influenza.Inactivated") %>%
(function(p) { pairwise.wilcox.test(x = p$sMFC,
g = p$gr,
p.adjust.method = "none")}) %>%
print()
##
## Pairwise comparisons using Wilcoxon rank sum test
##
## data: p$sMFC and p$gr
##
## low mid
## mid 0.982 -
## high 0.072 0.087
##
## P value adjustment method: none
filter(plotTemp, vaccine != "Influenza.Inactivated") %>%
(function(p) { pairwise.wilcox.test(x = p$sMFC,
g = p$gr,
p.adjust.method = "none")}) %>%
print()
##
## Pairwise comparisons using Wilcoxon rank sum test
##
## data: p$sMFC and p$gr
##
## low mid
## mid 0.036 -
## high 0.182 0.431
##
## P value adjustment method: none